#Executive Summary
library(ggplot2)
library(plyr)
library(dplyr)
library(formattable)
library(caret)
library(knitr)
library(kableExtra)
library(plotly)
library(corrplot)
library(ggthemes)
library(doParallel)
library(parallel)
library(corrplot)
library(purrr)
library(plotly)
# Load the case-study data set; expects CaseStudy2-data.csv in the working directory
CSData <- read.csv("CaseStudy2-data.csv", header = TRUE)
# Average employee age
mean(CSData$Age)
## [1] 36.82874
# Average years at company
mean(CSData$YearsAtCompany)
## [1] 6.962069
# Average job satisfaction (ordinal 1-4 scale)
mean(CSData$JobSatisfaction)
## [1] 2.709195
# Average years with current manager
mean(CSData$YearsWithCurrManager)
## [1] 4.14023
# Average years since last promotion
mean(CSData$YearsSinceLastPromotion)
## [1] 2.168966
# Average distance from home (units not given in data — presumably miles; verify against data dictionary)
mean(CSData$DistanceFromHome)
## [1] 9.33908
# Average environment satisfaction (ordinal 1-4 scale)
mean(CSData$EnvironmentSatisfaction)
## [1] 2.701149
# Average relationship satisfaction (ordinal 1-4 scale)
mean(CSData$RelationshipSatisfaction)
## [1] 2.706897
# Average performance rating
mean(CSData$PerformanceRating)
## [1] 3.151724
# Average total working years (full career, not just this company)
mean(CSData$TotalWorkingYears)
## [1] 11.05287
# Pie chart of gender mix — almost 60% of the workforce is male.
# plyr::count() returns a data frame (x = level, freq = count); the original
# passed that whole data frame as `values` and a raw per-row vector as
# `labels`, which plot_ly cannot aggregate correctly. Feed the count
# data frame's columns explicitly instead. (library(plotly) is already
# loaded at the top of the file, so the mid-script loads are dropped.)
GenderCount <- plyr::count(CSData$Gender)
plot_ly(data = GenderCount, labels = ~x, values = ~freq, type = "pie", title = "Gender")
# Pie chart of job roles as a percentage of the workforce
RoleCount <- plyr::count(CSData$JobRole)
plot_ly(data = RoleCount, labels = ~x, values = ~freq, type = "pie", title = "Job Roles")
# Overview of education level (ordinal 1-5 scale)
mean(CSData$Education)
## [1] 2.901149
# Pie chart of education fields
FieldCount <- plyr::count(CSData$EducationField)
plot_ly(data = FieldCount, labels = ~x, values = ~freq, type = "pie", title = "Education Field")
## Analysis of job roles and turnover
# Count total employees per job role
RoleTotal <- plyr::count(CSData$JobRole)
# Rename the count() output columns (x, freq) to meaningful names
names(RoleTotal) <- c("JobRole", "Total")
RoleTotal$JobRole <- as.character(RoleTotal$JobRole)
# Keep only employees who left the company.
# NOTE: AttrY is reused further down (Sales Rep section), so the name must stay.
AttrY <- dplyr::filter(CSData, Attrition == "Yes")
# Count of employees per role who quit
AttrRole <- plyr::count(AttrY$JobRole)
names(AttrRole) <- c("JobRole", "Attr")
AttrRole$JobRole <- as.character(AttrRole$JobRole)
# Left-join totals with attrition counts. all.x = TRUE keeps roles with zero
# attrition (a plain inner merge would silently drop them from the table).
Role_Attr_Total <- merge(RoleTotal, AttrRole, all.x = TRUE)
Role_Attr_Total$Attr[is.na(Role_Attr_Total$Attr)] <- 0
# Add the per-role turnover percentage
Role_Attr_Total <- mutate(Role_Attr_Total, Attr_Percent = Attr / Total)
# Format the Attr_Percent column as a percentage for display
Role_Attr_Total$Attr_Percent <- percent(Role_Attr_Total$Attr_Percent)
# Sort in descending order by attrition percentage
Role_Attr_Total <- Role_Attr_Total %>% arrange(desc(Attr_Percent))
kable(Role_Attr_Total) %>% kable_styling() %>% column_spec(4, bold = TRUE)
| JobRole | Total | Attr | Attr_Percent |
|---|---|---|---|
| Sales Representative | 53 | 24 | 45.28% |
| Human Resources | 27 | 6 | 22.22% |
| Laboratory Technician | 153 | 30 | 19.61% |
| Research Scientist | 172 | 32 | 18.60% |
| Sales Executive | 200 | 33 | 16.50% |
| Healthcare Representative | 76 | 8 | 10.53% |
| Manager | 51 | 4 | 7.84% |
| Manufacturing Director | 87 | 2 | 2.30% |
| Research Director | 51 | 1 | 1.96% |
# Stacked bar chart of attrition within each job role.
# fill = (Attrition == "Yes") creates a logical: FALSE = stayed, TRUE = left.
# The original labeled the FALSE segment "Total Employees", which mislabels
# it — that segment is only the employees who stayed, not the whole bar.
p <- ggplot(data = CSData, aes(JobRole, fill = (Attrition == "Yes")))
p + geom_bar() + coord_flip() + labs(title = "Attrition in each Job Role", x = "Job Role", y = "Number of Employees") +
  scale_fill_hc(name = "Attrition", labels = c("Stayed", "Left")) + theme_linedraw()
# Sales Rep attrition deep-dive
SalesReps <- filter(CSData, JobRole == "Sales Representative")
AttrYSales <- filter(AttrY, JobRole == "Sales Representative")
# Average years at the company: Sales Reps who leave average under 2.5 years
mean(SalesReps$YearsAtCompany)
## [1] 2.924528
mean(AttrYSales$YearsAtCompany)
## [1] 2.375
# Most Sales Reps who stay with the company are 30 and over
# (plot(factor, numeric) dispatches to a box plot by attrition group)
plot(SalesReps$Attrition, SalesReps$Age, xlab = "Attrition", ylab = "Age", main = "Sales Rep Attrition and Age")
# Job satisfaction for Sales Reps with attrition is never above 3.0 and averages 2.5
plot(SalesReps$Attrition, SalesReps$JobSatisfaction, xlab = "Attrition", ylab = "Job Satisfaction", main = "Sales Rep Attrition and Job Satisfaction")
# Sales Reps who leave generally live farther away than the 5-mile average of those who stay
plot(SalesReps$Attrition, SalesReps$DistanceFromHome, xlab = "Attrition", ylab = "Distance from home", main = "Sales Rep Attrition and Distance from Home")
# The reps who leave are disproportionately single
# (plot(factor, factor) dispatches to a spine plot)
plot(SalesReps$Attrition, SalesReps$MaritalStatus, xlab = "Attrition", ylab = "Marriage Status", main = "Sales Rep Attrition and Marital Status")
# Human Resources attrition deep-dive
HR <- filter(CSData, JobRole == "Human Resources")
# HR reps who leave average under 30 years of age
plot(HR$Attrition, HR$Age, xlab = "Attrition", ylab = "Age", main = "Human Resources Attrition and Age")
# HR reps who left lived much farther from work — on average 20 miles away
plot(HR$Attrition, HR$DistanceFromHome, xlab = "Attrition", ylab = "Distance From Home", main = "Human Resources Attrition and Distance from home")
# Job satisfaction of those who left averaged 2
plot(HR$Attrition, HR$JobSatisfaction, xlab = "Attrition", ylab = "Job Satisfaction", main = "Human Resources Attrition and Job Satisfaction")
# Total working years: stayers averaged 7 years, leavers averaged 2.
# Fixed: the original plotted TotalWorkingYears but labeled the y-axis
# "Job Satisfaction" (copy-paste error), and had typos in xlab/main.
plot(HR$Attrition, HR$TotalWorkingYears, xlab = "Attrition", ylab = "Total Working Years", main = "Human Resources Attrition and Total Working Years")
# Stayers average five years at the company; leavers less than two
plot(HR$Attrition, HR$YearsAtCompany, xlab = "Attrition", ylab = "Years At Company", main = "Human Resources Attrition and Years at the Company")
# Laboratory Technician attrition deep-dive
LabTech <- filter(CSData, CSData$JobRole == "Laboratory Technician")
# Lab Techs who quit had a much lower environment satisfaction than the average of 3
plot(LabTech$Attrition, LabTech$EnvironmentSatisfaction, xlab = "Attrition", ylab = "Environment Satisfaction", main = "Lab Tech Attrition and Environment Satisfaction")
# Lab Tech attrition by age
plot(LabTech$Attrition, LabTech$Age, xlab = "Attrition", ylab = "Age", main = "Lab Tech Attrition and Age")
# Lab Tech attrition by distance from home
plot(LabTech$Attrition, LabTech$DistanceFromHome, xlab = "Attrition", ylab = "Distance From Home", main = "Lab Tech Attrition and Distance from home")
# Lab Tech attrition by total working years
plot(LabTech$Attrition, LabTech$TotalWorkingYears, xlab = "Attrition", ylab = "Total Working Years", main = "Lab Tech Attrition and Total working years")
# Single Lab Techs are significantly more likely to leave
plot(LabTech$Attrition, LabTech$MaritalStatus, xlab = "Attrition",ylab = "Marital Status",main = "Lab Tech Attrition and Marital Status" )
# Look at the proportion of each variable that influences attrition.
# Step 1: remove identifier/constant columns that carry no attrition signal
# (ID, EmployeeNumber, StandardHours, Over18, EmployeeCount).
CSData_AttrUseful <- CSData %>% select(- c(ID,EmployeeNumber, StandardHours, Over18, EmployeeCount))
# Helper: proportional (position = "fill") stacked bar of attrition within
# each level of the variable named by x. Uses the tidy-eval .data pronoun
# instead of the deprecated aes_string().
AttrPlot <- function(df, x, y){
  ggplot(data = df, aes(x = .data[[x]], fill = .data[[y]])) +
    geom_bar(alpha = .9, position = "fill") +
    coord_flip() + labs(x = x, y = "Attrition") + theme_hc()+ scale_fill_hc()
}
yname <- "Attrition"
# Plot every predictor against Attrition. Exclude the response by name:
# the original used names(df[-ncol(df)]), which assumes Attrition is the
# last column — not guaranteed after the select() above.
xname <- setdiff(names(CSData_AttrUseful), yname)
lapply(xname, function(x) AttrPlot(df = CSData_AttrUseful, x = x, y = yname))
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
##
## [[27]]
##
## [[28]]
##
## [[29]]
##
## [[30]]
# Remove columns that are not useful for modeling
# (duplicates the earlier select; kept so this section runs stand-alone)
CSData_AttrUseful <- CSData %>% select(- c(ID,EmployeeNumber, StandardHours, Over18, EmployeeCount))
# Create training and test data: 60/40 split, stratified on Attrition
set.seed(8)
TrainObs <- createDataPartition(y = CSData_AttrUseful$Attrition, p = .60, list = FALSE)
# Training observations for the attrition classifiers
AttrTrain <- CSData_AttrUseful[TrainObs,]
# Held-out test observations for the attrition classifiers
AttrTest <- CSData_AttrUseful[-TrainObs,]
# Training control: 25-fold CV repeated 5 times; classProbs = TRUE is
# required so twoClassSummary can compute ROC/Sens/Spec metrics
trainMethod <- trainControl(method = "repeatedcv", number = 25, repeats = 5, summaryFunction = twoClassSummary, classProbs = TRUE)
# Check number of cores available for parallel processing
parallel::detectCores() #4 cores detected on iMac used for study
## [1] 4
# Reserve 3 cores for model training (leaving one for the OS/session)
workers <- makeCluster(3L)
# Register the cluster so caret::train runs resamples in parallel
registerDoParallel(workers)
# Fit the Naive Bayes model. metric = "Spec" tunes for specificity —
# here the "Yes" (attrition) class, since caret treats "No" as positive.
fit.nb <- train(Attrition ~., data = AttrTrain, method = "nb", metric = "Spec", trControl = trainMethod, preProcess = c("center","scale"), tuneLength = 31)
# Predictions on the held-out test set using the Naive Bayes model
pred.nb <- predict(fit.nb, AttrTest)
# Summary of Naive Bayes predictions
summary(pred.nb)
## No Yes
## 214 134
# Confusion matrix to assess the model (positive class is "No")
confusionMatrix(pred.nb, AttrTest$Attrition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 195 19
## Yes 97 37
##
## Accuracy : 0.6667
## 95% CI : (0.6144, 0.716)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2102
##
## Mcnemar's Test P-Value : 8.724e-13
##
## Sensitivity : 0.6678
## Specificity : 0.6607
## Pos Pred Value : 0.9112
## Neg Pred Value : 0.2761
## Prevalence : 0.8391
## Detection Rate : 0.5603
## Detection Prevalence : 0.6149
## Balanced Accuracy : 0.6643
##
## 'Positive' Class : No
##
# Fit the k-Nearest Neighbors classifier (same resampling setup as Naive Bayes)
fit.knn <- train(Attrition ~., data = AttrTrain, method = "knn", metric = "Spec", trControl = trainMethod, preProcess = c("center","scale"), tuneLength = 31)
# Predictions on the held-out test set using the kNN model
pred.knn <- predict(fit.knn, AttrTest)
# Summary of kNN predictions
summary(pred.knn)
## No Yes
## 333 15
# Confusion matrix to assess the model (positive class is "No")
confusionMatrix(pred.knn, AttrTest$Attrition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 287 46
## Yes 5 10
##
## Accuracy : 0.8534
## 95% CI : (0.8119, 0.8889)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 0.2588
##
## Kappa : 0.2293
##
## Mcnemar's Test P-Value : 2.13e-08
##
## Sensitivity : 0.9829
## Specificity : 0.1786
## Pos Pred Value : 0.8619
## Neg Pred Value : 0.6667
## Prevalence : 0.8391
## Detection Rate : 0.8247
## Detection Prevalence : 0.9569
## Balanced Accuracy : 0.5807
##
## 'Positive' Class : No
##
# Draw a correlation heat map for the numeric columns of a data frame.
# Non-numeric columns are dropped, then rows containing NAs, before the
# pairwise correlations are computed and rendered with corrplot.
correlator <- function(df){
  numeric_cols <- keep(df, is.numeric)
  complete_rows <- tidyr::drop_na(numeric_cols)
  corr_matrix <- cor(complete_rows)
  corrplot(corr_matrix,
           addCoef.col = "white", number.digits = 2,
           number.cex = .5, method = "square",
           order = "hclust",
           tl.srt = 45, tl.cex = .8)
}
correlator(CSData_AttrUseful)
# Create the training and test data for the Monthly Income models
set.seed(12)
# NOTE(review): the partition is stratified on Attrition, not MonthlyIncome,
# even though these splits feed the income regressions — confirm this is intended
TrainObs <- createDataPartition(y = CSData_AttrUseful$Attrition, p = .60, list = FALSE)
# Training observations for the Monthly Income models
MITrain <- CSData_AttrUseful[TrainObs,]
# Held-out test observations for the Monthly Income models
MITest <- CSData_AttrUseful[-TrainObs,]
# Training control for the regression models: 25-fold CV repeated 5 times
trainMethod2 <- trainControl(method = "repeatedcv", number = 25, repeats = 5)
# Fit the linear regression model
fit.lm <- train(MonthlyIncome ~., data = MITrain, method = "lm", trControl = trainMethod2)
# Print resampled RMSE/R-squared of the linear model
fit.lm
## Linear Regression
##
## 522 samples
## 30 predictor
##
## No pre-processing
## Resampling: Cross-Validated (25 fold, repeated 5 times)
## Summary of sample sizes: 501, 501, 501, 500, 502, 501, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 1088.389 0.9385737 853.8985
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Fit the kNN regression model on the same training split
fit.knnreg <- train(MonthlyIncome ~., data = MITrain, method = "knn", trControl = trainMethod2)
# Print resampled RMSE of the kNN regression model
# (much worse than the linear model — RMSE ~4700 vs ~1088)
fit.knnreg
## k-Nearest Neighbors
##
## 522 samples
## 30 predictor
##
## No pre-processing
## Resampling: Cross-Validated (25 fold, repeated 5 times)
## Summary of sample sizes: 501, 499, 501, 502, 500, 501, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 4924.749 0.04703560 3749.325
## 7 4756.036 0.04941159 3620.643
## 9 4691.593 0.04871913 3606.306
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 9.